Introduction

In this project we use a dataset of wine reviews to predict review points from numerical, categorical and textual predictors.

The data comes from Kaggle Datasets and covers 150k wine reviews along with some attributes of the wines; it is available on Kaggle as the "Wine Reviews" dataset (a free Kaggle login is required to access it directly from kaggle.com). The data was originally scraped from WineEnthusiast.

The dataset contains the following columns:

This is a particularly interesting problem for several reasons:

Methods

Feature Engineering

### Add 'continentTopFive' feature
wine = data.frame(wine, continentTopFive = getContinent_withTop5(wine$country))
#levels(wine$continentTopFive)
## Points vs log(price), coloured by continent/top-5-country group, with a
## separate OLS trend line per group.
## Fix: R's built-in colour database has no spaced names — "dodger blue" and
## "dark green" raise "invalid color name" when the plot renders; the valid
## forms are "dodgerblue" and "darkgreen".
ggplot(wine, aes(x = log(price), y = points)) +
  geom_point(aes(col = continentTopFive)) +
  stat_summary(fun.data = mean_cl_normal) +
  stat_smooth(aes(colour = continentTopFive), method = "lm", se = FALSE) +
  scale_colour_manual(values = c("red", "green", "blue", "orange", "dodgerblue", "violet", "darkgreen")) +
  ggtitle("Points versus Log(price), by ContinentTopFive") +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(x = "log(price)", y = "points")

### Perform LDA and sentiment analysis on the description

## Build a text corpus from the free-text review descriptions.
corpus = tm::Corpus(tm::VectorSource(as.character(wine$description)))
## transform corpus: remove white space and punctuation, transform to lower case, remove stop words, stem.
## Consistency fix: the chunk qualified tm::tm_map but left the transformer
## functions unqualified; qualify them all so the chunk does not depend on
## tm being attached to the search path.
corpus = tm::tm_map(corpus, tm::stripWhitespace)
corpus = tm::tm_map(corpus, tm::removePunctuation)
corpus = tm::tm_map(corpus, tm::content_transformer(tolower))
corpus = tm::tm_map(corpus, tm::removeWords, tm::stopwords("english"))
# corpus = tm_map(corpus, removeWords, c("wine"))
corpus = tm::tm_map(corpus, tm::stemDocument)
## create DocumentTerm matrix for LDA
corpus_dtm = tm::DocumentTermMatrix(corpus)

## Number of latent topics for the LDA model (2 chosen; see Discussion —
## 3 topics were tried and did not improve performance).
NUM_TOPICS = 2

## Fit LDA with a fixed seed so topic assignments are reproducible.
corpus_lda = topicmodels::LDA(corpus_dtm, k = NUM_TOPICS, control = list(seed = 1234))
## Per-document topic probabilities (the gamma matrix) in long/tidy form.
corpus_documents = tidy(corpus_lda, matrix = "gamma")

SENTIMENT_ENGINE = "bing"
## Derive sentiment score
## Per-document net sentiment = (# positive terms) - (# negative terms),
## weighted by term counts from the document-term matrix. Documents with no
## sentiment-lexicon words are dropped by the inner_join.
corpus_sentiments <- tidy(corpus_dtm) %>%
  inner_join(get_sentiments(SENTIMENT_ENGINE), by = c(term = "word")) %>%
  count(document, sentiment, wt = count) %>%
  ungroup() %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative) %>%
  arrange(sentiment)
## introduce new predictors
## NOTE(review): gamma[1:nrow(wine)] assumes tidy() emits all topic-1 rows
## first, in document order — confirm; filter(topic == 1) with an explicit
## arrange on document would be safer.
wine = data.frame(wine, topic1 = corpus_documents$gamma[1:nrow(wine)], document = rownames(wine))
## NOTE(review): column 4 is selected by position (intended to be the derived
## net "sentiment" column produced after spread()); selecting by name would be
## robust to upstream column-order changes.
wine = merge(wine, corpus_sentiments[,c(1,4)], by = "document")
## Keep only rows complete on the modeling variables, and drop every other column.
wine = wine[complete.cases(wine[, c("points","price", "continentTopFive", "topic1", "sentiment")]), c("points","price", "continentTopFive", "topic1", "sentiment")]
names(wine)
## [1] "points"           "price"            "continentTopFive" "topic1"          
## [5] "sentiment"

Training and Test Set Generation

We shuffled our data, then created an 80% training, 20% test split for later use in cross validation. We did 10 fold cross validation of every model.

## Hold out a 20% test set for final evaluation.
# Shuffle the rows so the split is random with respect to source order.
wine = wine[sample(nrow(wine)), ]

# Draw 80% of the row indices for training; the remainder is the test set.
n_train = floor(0.80 * nrow(wine))
train_rows = sample(seq_len(nrow(wine)), size = n_train)
wine_train = wine[train_rows, ]
wine_test = wine[-train_rows, ]
# Sanity check: the two partitions exactly cover the data.
nrow(wine) == nrow(wine_test) + nrow(wine_train)
## [1] TRUE

Model Selection

Model #1

## Model 1: log-transformed response. Points start at 80 in this dataset, so
## subtracting 79.999 keeps the argument of log() strictly positive.
mod = lm(log(points-79.999) ~ log(price)*continentTopFive*sentiment*topic1 + I(log(price)^2) + I(log(price)^3), data=wine_train)
## Backward selection under BIC (k = log(n)).
select = step(mod, direction="backward", k=log(nrow(wine_train)), trace=FALSE)

# remove outliers and highly influential points and refit the *select*ed model
# NOTE(review): a residual cutoff of 7 on a log-scale response is enormous, so
# the first condition is unlikely to drop anything — confirm whether a smaller
# threshold was intended. The second condition is the usual 4/n Cook's rule.
keep = which(abs(resid(select)) <= 7 & cooks.distance(select) <= 4 / nrow(wine_train))

## Formula chosen by the step() call above, written out explicitly.
select_formula1 = log(points - 79.999) ~ log(price) + continentTopFive + topic1 + I(log(price)^2) + I(log(price)^3) + log(price):continentTopFive + continentTopFive:topic1
mod1 = lm(select_formula1, data=wine_train, subset=keep)

# Plot details and perform tests for homoscedasticity and normality of errors.
plot(mod1)

bptest(mod1)
## 
##  studentized Breusch-Pagan test
## 
## data:  mod1
## BP = 3000, df = 22, p-value <2e-16
shapiro.test(sample(resid(mod1),5000))
## 
##  Shapiro-Wilk normality test
## 
## data:  sample(resid(mod1), 5000)
## W = 0.91, p-value <2e-16
wine_train_cv = wine_train[keep,]

## 10-fold CV RMSE; the last argument back-transforms predictions from the
## log scale to the original points scale before RMSE is computed.
RMSE1 = cv_mean_rmse(select_formula1, wine_train_cv, 10, function(x) {exp(x) + 79.999} )

Model 2

## Model 2: untransformed response with the same candidate predictor set as
## Model 1 (full four-way interaction plus cubic terms in log(price)).
mod = lm(points ~ log(price)*continentTopFive*sentiment*topic1 + I(log(price)^2) + I(log(price)^3), data=wine_train)
## Backward selection under BIC (k = log(n)).
select = step(mod, direction="backward", k=log(nrow(wine_train)), trace=FALSE)
# plot(select)
# bptest(select)
# shapiro.test(sample(resid(select),5000))

#remove highly influential points
# Usual rule of thumb: drop observations with Cook's distance > 4/n.
keep = which(cooks.distance(select) <= 4 / nrow(wine_train))
## Formula chosen by the step() call above, written out explicitly.
select_formula2 = points ~ log(price) + continentTopFive + topic1 + 
    I(log(price)^2) + I(log(price)^3) + log(price):continentTopFive + 
    log(price):topic1 + continentTopFive:topic1
mod2 = lm(select_formula2, data=wine_train, subset=keep)
plot(mod2)

bptest(mod2)
## 
##  studentized Breusch-Pagan test
## 
## data:  mod2
## BP = 6200, df = 23, p-value <2e-16
shapiro.test(sample(resid(mod2),5000))
## 
##  Shapiro-Wilk normality test
## 
## data:  sample(resid(mod2), 5000)
## W = 1, p-value = 4e-06
wine_train_cv = wine_train[keep,]

## 10-fold CV RMSE; no inverse transform needed — the response is already on
## the points scale.
RMSE2 = cv_mean_rmse(select_formula2, wine_train_cv, 10)

Model 3

## Model 3: Box-Cox transform of the response, with lambda estimated by caret.
bc_mod = caret::BoxCoxTrans(wine_train$points)

## Append the transformed response as a new column on the training set.
wine_train = cbind(wine_train, bc_points = predict(bc_mod, wine_train$points))
mod = lm(bc_points ~ log(price)*continentTopFive*sentiment*topic1 + I(log(price)^2) + I(log(price)^3), data=wine_train)
## Backward selection under BIC (k = log(n)).
select = step(mod, direction="backward", k=log(nrow(wine_train)), trace=FALSE)

## NOTE(review): Cook's distances are taken from the full model `mod` here,
## whereas Models 1, 2 and 5 use the step-selected model `select` — confirm
## which was intended. (`keep` is also a logical vector here rather than the
## which() indices used elsewhere; both forms work with subset=.)
keep = cooks.distance(mod) <= 4 / nrow(wine_train)
## Formula chosen by the step() call above, written out explicitly.
select_formula3 = bc_points ~ log(price) + continentTopFive + topic1 + 
    I(log(price)^2) + I(log(price)^3) + log(price):continentTopFive + 
    log(price):topic1 + continentTopFive:topic1

mod3 = lm(select_formula3, data=wine_train, subset = keep)
plot(mod3)

bptest(mod3)
## 
##  studentized Breusch-Pagan test
## 
## data:  mod3
## BP = 5000, df = 23, p-value <2e-16
shapiro.test(sample(resid(mod3),5000))
## 
##  Shapiro-Wilk normality test
## 
## data:  sample(resid(mod3), 5000)
## W = 0.99, p-value = 2e-14
### Perform 10 fold cross validation

#Randomly shuffle the training set and create 10 folds
wine_train_cv = wine_train[keep,]

# see section 13.1.2 at http://daviddalpiaz.github.io/appliedstats/transformations.html#response-transformation
## Invert the Box-Cox transform (lambda != 0 branch) so the CV RMSE is
## computed on the original points scale.
RMSE3 = cv_mean_rmse(select_formula3, wine_train_cv, 10, function(x) {(x*bc_mod$lambda + 1)^(1/bc_mod$lambda)})

Model 4

## Model 4: weighted least squares on the untransformed response.
## NOTE(review): `subset = keep` on this first fit reuses the `keep` vector
## left over from the Model 3 chunk — `keep` is only recomputed two lines
## below. Confirm whether this fit was meant to use Model 4's own filter.
## NOTE(review): `sqrt(1 / 1 + abs(points -90))` parses as
## sqrt(1 + abs(points - 90)) because `/` binds tighter than `+`. That gives
## minimum weight at points = 90 (where residual variance peaks), which does
## match the Discussion's description — but the notation is misleading.
mod = lm(points ~ log(price)*continentTopFive*sentiment*topic1 + I(log(price)^2) + I(log(price)^3), data=wine_train, subset = keep, weights = sqrt(1 / 1 + abs(points -90)))
## Backward selection under BIC (k = log(n)).
select = step(mod, direction="backward", k=log(nrow(wine_train)), trace=FALSE)
## NOTE(review): distances come from `mod` (not `select`), and `mod` was fit
## on a subset, so this logical vector is shorter than nrow(wine_train) and
## will be recycled when used as subset=/row index below — verify alignment.
keep = cooks.distance(mod) <= 4 / nrow(wine_train)

## Formula chosen by the step() call above, written out explicitly.
select_formula4 = points ~ log(price) + continentTopFive + topic1 + 
    I(log(price)^2) + I(log(price)^3) + log(price):continentTopFive + 
    log(price):topic1 + continentTopFive:topic1
mod4 = lm(select_formula4, data=wine_train, subset = keep, weights = sqrt(1 / 1 + abs(points -90)))

plot(mod4)

bptest(mod4)
## 
##  studentized Breusch-Pagan test
## 
## data:  mod4
## BP = 4500, df = 23, p-value <2e-16
shapiro.test(sample(resid(mod4),5000))
## 
##  Shapiro-Wilk normality test
## 
## data:  sample(resid(mod4), 5000)
## W = 1, p-value = 4e-06
### Perform 10 fold cross validation

#Randomly shuffle the training set and create 10 folds
wine_train_cv = wine_train[keep,]

## 10-fold CV RMSE (computed unweighted, on the points scale).
RMSE4 = cv_mean_rmse(select_formula4, wine_train_cv, 10)

Model 5

## Model 5: untransformed response, extending the candidate set with a
## degree-7 polynomial in log(price).
mod = lm(points ~ log(price)*continentTopFive*sentiment*topic1 + I(log(price)^2) + I(log(price)^3) +
           I(log(price)^4) + I(log(price)^5)  + I(log(price)^6) + I(log(price)^7), data=wine_train)
## Backward selection under BIC (k = log(n)).
select = step(mod, direction="backward", k=log(nrow(wine_train)), trace=FALSE)

#remove highly influential points
# Usual rule of thumb: drop observations with Cook's distance > 4/n.
keep = which(cooks.distance(select) <= 4 / nrow(wine_train))

#update
## Formula chosen by the step() call above, written out explicitly.
select_formula5 = points ~ log(price) + continentTopFive + topic1 + I(log(price)^2) + I(log(price)^3) +
  I(log(price)^4) + I(log(price)^5) + I(log(price)^6) + I(log(price)^7) + log(price):continentTopFive +
  log(price):topic1 + continentTopFive:topic1
mod5 = lm(select_formula5, data=wine_train, subset=keep)
plot(mod5)

bptest(mod5)
## 
##  studentized Breusch-Pagan test
## 
## data:  mod5
## BP = 6100, df = 27, p-value <2e-16
shapiro.test(sample(resid(mod5),5000))
## 
##  Shapiro-Wilk normality test
## 
## data:  sample(resid(mod5), 5000)
## W = 0.99, p-value = 4e-13
wine_train_cv = wine_train[keep,]

## 10-fold CV RMSE on the influence-trimmed training data.
RMSE5 = cv_mean_rmse(select_formula5, wine_train_cv, 10)

Results

##        CV RMSE
## model1   2.521
## model2   2.432
## model3   2.447
## model4   2.533
## model5   2.436
## Score the winning model (Model 2, lowest CV RMSE) on the held-out test set.
preds = predict(mod2, newdata=wine_test)
## Empirical coverage of a +/- 2 * RMSE prediction interval around each
## prediction.
## NOTE(review): best_cv_rmse is not defined in this excerpt — presumably
## min(RMSE1, ..., RMSE5) computed elsewhere; confirm.
success_ratio = sum(wine_test$points >= preds - 2*best_cv_rmse & wine_test$points <= preds + 2*best_cv_rmse) / nrow(wine_test)

Discussion

We used four different strategies to attempt to meet the assumptions of multiple linear regression and minimize cross-validated RMSE: transforming the response with a shifted log and with Box-Cox, adding higher-degree polynomial terms in log(price) alongside the engineered features, and weighted least squares regression. In the end, the model with the untransformed response (Model 2) had the lowest cross-validated RMSE, with a value of 2.4317.

Normality of the errors and homoscedasticity are suspect, as demonstrated by the Breusch-Pagan and Shapiro-Wilk tests and our plots. (The low p-values on the Shapiro-Wilk tests indicate that the normality assumption is likely violated, and the low p-values on the Breusch-Pagan tests indicate that the equal-variance assumption is likely violated. Similarly, the patterns in our Q-Q plots are indicative of heavy tails, and the patterns in our fitted-vs-residuals plots indicate heteroscedasticity, as the variance is clearly largest around 90 points and tapers toward the extremes.) We made many attempts to achieve normality and homoscedasticity (including additional, less interesting attempts not shown here), but were not successful. (Note, however, that because Shapiro-Wilk in R accepts at most 5000 residuals, sampling is required, and the test result therefore depends on the random seed used for that sampling; with some seeds the test would pass.)

Our goal here was prediction more than inference, so we’re not too concerned with model complexity here – we are aiming for predictive power. We get around the normality and homoskedastic assumptions by using cross-validated RMSE to create our prediction intervals. Doing so gives a 94% success rate on the withheld test set with a +/- 4.8 point prediction interval (i.e. this percentage of the withheld test set points were within the prediction interval of Model #2).

A note on model 4: weighted least squares was performed, with weight proportional to the inverse of the observed variance of points. Since the observed variance of the residuals peaked at points = 90 and tapered toward the ends of the points range, the weight function assumes its minimum at points = 90 and increases toward the ends of the range. The weight used was \(w = \sqrt{1 + \lvert points - 90 \rvert}\); note that the R expression `sqrt(1 / 1 + abs(points - 90))` evaluates to exactly this, because `1 / 1` is computed before the addition.

Due to time constraints we were limited on creating more topics and evaluating their impact on the models. We did attempt to create LDA with 3 topics, but it didn’t show as much improvement in the model performance.

Potential for future improvements: We limited ourselves to regression due to the nature of the course, but techniques such as random forests are an obvious next step. Additionally, this project could potentially be further expanded in the future by using deeper NLP, more LDA topics, additional predictors, and expanded model selection attempts. It may also be the case that the data is noisy, with a high base \(\epsilon\) variance of points no matter which predictors are used (subjective human judgement after all).

Appendix

About the LDA

## Per-topic word probabilities (the beta matrix) in long/tidy form.
corpus_topics = tidy(corpus_lda, matrix = "beta")
#corpus_topics
## Top 10 terms within each topic, ordered by probability.
corpus_top_terms = corpus_topics %>%
  group_by(topic) %>%
  top_n(n = 10, wt = beta) %>%
  ungroup() %>%
  arrange(topic, desc(beta))

## Faceted bar chart: the highest-probability terms for each topic.
corpus_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(x = term, y = beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  labs(title = "Word Probabilities for the 2 Topics")

## Contrast the two topics: log2 ratio of each term's probability under
## topic 2 vs topic 1, keeping only terms that are non-negligible
## (beta > 0.001) in at least one topic.
beta_spread = corpus_topics %>%
  mutate(topic = paste0("topic", topic)) %>%
  spread(topic, beta) %>%
  filter(topic1 > .001 | topic2 > .001) %>%
  mutate(log_ratio = log2(topic2 / topic1))
#beta_spread

## Plot the beta spread of the topics.
## NOTE(review): ordering by abs(log_ratio) ascending selects the 30 terms
## *least* differentiated between the topics — confirm whether the most
## differentiated terms (decreasing order) were intended.
spread_subset = beta_spread[order(abs(beta_spread$log_ratio)), ][1:30, ]
ggplot(spread_subset, aes(x = term, y = log_ratio)) +
  geom_bar(stat = "identity", fill = "green", width = .2) +
  coord_flip() +
  ggtitle("Log ratio of word probabilities in topic2/topic1")